logo

Motivation

Can we organize the countries of the world by variables we are interested in?

For instance, the World Bank organizes countries in 4 income groups using just the Gross National Income (GNI) per capita

But what if we are interested in health? And in much more variables?

In this case study, we will categorise countries using socio-economic and health indicators that determine the overall development of the world

library(tidyverse)
library(countrycode)
library(rworldmap)

The dataset is obtained through the notebook “WHO_preprocessing”, which creates a file named “WorldData2022.txt”. There you can find explanation for the following World Health Organization (WHO) variables:

Load data and descriptive analysis

WHO = read.table("WorldData2022.txt", sep=",")
names(WHO)[c(5, 8, 9, 10, 11, 12, 13, 15, 16)] <- c("LifeE", "Pop", "AirP", "HR", "HB", "Int", "Phy", "Sch1", "Sch2")
dim(WHO)
## [1] 190  16
str(WHO)
## 'data.frame':    190 obs. of  16 variables:
##  $ COUNTRY  : chr  "AFG" "AGO" "ALB" "AND" ...
##  $ Country  : chr  "Afghanistan" "Angola" "Albania" "Andorra" ...
##  $ Continent: chr  "Asia" "Africa" "Europe" "Europe" ...
##  $ Region   : chr  "South Asia" "Sub-Saharan Africa" "Europe & Central Asia" "Europe & Central Asia" ...
##  $ LifeE    : num  63.2 63.1 78 NA 76.1 ...
##  $ GNI      : int  410 3960 3970 41750 39640 8620 3200 NA 46200 46920 ...
##  $ IMR      : num  44.97 48.34 8.76 2.38 5.62 ...
##  $ Pop      : num  35530 29784 2873 77 9400 ...
##  $ AirP     : num  55.14 38.29 18.07 8.71 44.87 ...
##  $ HR       : num  13.29 3.266 3.56 NA 0.943 ...
##  $ HB       : num  3.9 8 28.9 NA 13.8 ...
##  $ Int      : num  5.45 16.94 54.66 86.43 85 ...
##  $ Phy      : num  2.54 2.14 18.75 33.33 26.01 ...
##  $ HDI      : num  0.478 0.586 0.796 0.858 0.911 0.842 0.759 0.788 0.951 0.916 ...
##  $ Sch1     : num  10.3 12.2 14.4 13.3 15.7 ...
##  $ Sch2     : num  2.99 5.42 11.29 10.56 12.69 ...
head(WHO)
##   COUNTRY              Country Continent                     Region    LifeE
## 1     AFG          Afghanistan      Asia                 South Asia 63.20990
## 2     AGO               Angola    Africa         Sub-Saharan Africa 63.06044
## 3     ALB              Albania    Europe      Europe & Central Asia 78.00018
## 4     AND              Andorra    Europe      Europe & Central Asia       NA
## 5     ARE United Arab Emirates      Asia Middle East & North Africa 76.07952
## 6     ARG            Argentina  Americas  Latin America & Caribbean 76.57514
##     GNI      IMR       Pop  AirP       HR     HB      Int    Phy   HDI     Sch1
## 1   410 44.96735 35530.081 55.14 13.29011  3.900  5.45455  2.538 0.478 10.26384
## 2  3960 48.33796 29784.193 38.29  3.26631  8.000 16.93721  2.140 0.586 12.17210
## 3  3970  8.75843  2873.457 18.07  3.56019 28.893 54.65596 18.754 0.796 14.44800
## 4 41750  2.38052    76.965  8.71       NA     NA 86.43443 33.333 0.858 13.30024
## 5 39640  5.62012  9400.145 44.87  0.94339 13.800 85.00000 26.011 0.911 15.71769
## 6  8620  7.60766 44271.041 12.58 10.44673 49.920 55.80000 40.596 0.842 17.87487
##        Sch2
## 1  2.985070
## 2  5.417391
## 3 11.286455
## 4 10.555120
## 5 12.694030
## 6 11.147269
tail(WHO)
##     COUNTRY      Country Continent                     Region    LifeE  GNI
## 185     VUT      Vanuatu   Oceania        East Asia & Pacific 65.30640 2630
## 186     WSM        Samoa   Oceania        East Asia & Pacific 70.45068 3030
## 187     YEM        Yemen      Asia Middle East & North Africa 66.63147 1160
## 188     ZAF South Africa    Africa         Sub-Saharan Africa 65.25417 6090
## 189     ZMB       Zambia    Africa         Sub-Saharan Africa 62.45290 1070
## 190     ZWE     Zimbabwe    Africa         Sub-Saharan Africa 60.68252  480
##          IMR       Pop  AirP       HR   HB      Int   Phy   HDI     Sch1
## 185 21.06880   276.244 10.50  2.26138   NA 10.59800 1.653 0.607 11.53532
## 186 14.61823   196.440 10.76  2.81582 10.0 12.92249 5.998 0.707 12.41886
## 187 45.70822 28250.420 41.46  9.73575  7.1 17.44650 5.251 0.455  9.09871
## 188 25.77835 56717.156 25.15 35.86099 23.0 41.00000 7.923 0.713 13.64371
## 189 41.66417 17094.130 30.79  6.45656 20.0 13.46820 1.168 0.565 10.92876
## 190 37.92896 16529.904 25.18  5.25154 17.0 17.09080 1.990 0.593 12.11097
##          Sch2
## 185  7.064846
## 186 11.403800
## 187  3.200000
## 188 11.373160
## 189  7.187091
## 190  8.710909
summary(WHO)
##    COUNTRY            Country           Continent            Region         
##  Length:190         Length:190         Length:190         Length:190        
##  Class :character   Class :character   Class :character   Class :character  
##  Mode  :character   Mode  :character   Mode  :character   Mode  :character  
##                                                                             
##                                                                             
##                                                                             
##                                                                             
##      LifeE            GNI             IMR              Pop           
##  Min.   :50.75   Min.   :  180   Min.   : 1.542   Min.   :     11.2  
##  1st Qu.:66.59   1st Qu.: 1250   1st Qu.: 5.362   1st Qu.:   2066.7  
##  Median :73.82   Median : 4385   Median :13.507   Median :   8466.0  
##  Mean   :72.60   Mean   :12254   Mean   :20.302   Mean   :  39010.8  
##  3rd Qu.:77.71   3rd Qu.:13762   3rd Qu.:31.289   3rd Qu.:  28833.6  
##  Max.   :84.26   Max.   :86390   Max.   :80.097   Max.   :1386395.0  
##  NA's   :10      NA's   :14      NA's   :2        NA's   :1          
##       AirP             HR                HB              Int       
##  Min.   : 5.73   Min.   : 0.2095   Min.   :  1.00   Min.   : 0.80  
##  1st Qu.:13.48   1st Qu.: 1.2707   1st Qu.: 10.38   1st Qu.:12.84  
##  Median :22.15   Median : 4.0561   Median : 21.00   Median :37.36  
##  Mean   :27.55   Mean   : 8.4071   Mean   : 27.42   Mean   :39.50  
##  3rd Qu.:37.50   3rd Qu.: 9.9503   3rd Qu.: 38.51   3rd Qu.:61.81  
##  Max.   :93.18   Max.   :71.6383   Max.   :129.80   Max.   :96.00  
##  NA's   :3       NA's   :10        NA's   :15       NA's   :6      
##       Phy              HDI              Sch1             Sch2       
##  Min.   : 0.345   Min.   :0.3850   Min.   : 5.543   Min.   : 2.115  
##  1st Qu.: 3.912   1st Qu.:0.5982   1st Qu.:11.575   1st Qu.: 6.234  
##  Median :15.844   Median :0.7350   Median :13.385   Median : 9.365  
##  Mean   :19.311   Mean   :0.7200   Mean   :13.509   Mean   : 8.989  
##  3rd Qu.:30.055   3rd Qu.:0.8317   3rd Qu.:15.591   3rd Qu.:11.540  
##  Max.   :84.199   Max.   :0.9620   Max.   :21.055   Max.   :14.091  
##  NA's   :3

Out of the 200 countries in the world, we have almost complete information (10 variables) from 190 countries:

  • Diverse variables (socio-economic and health)

  • Just a few missing values

The output of the clustering tool depends directly on the selected variables for the input. This selection is subjective and depends on the application’s objective

# SELECT HERE YOUR FAVOURITE VARIABLES
X = WHO %>% dplyr::select(-Continent,-Region,-Country, -COUNTRY,-Pop,-HDI)

Take care: we are going to omit rows with NAs

dim(WHO)[1]
## [1] 190
dim(na.omit(X))[1]
## [1] 165
id.complete = complete.cases(X)
names = WHO$Country[id.complete]
continent = WHO$Continent[id.complete]
HDI = WHO$HDI[id.complete]

X = X[id.complete,]
row.names(X)=names

X$GNI = log(X$GNI)

Multiple scatterplots:

GGally::ggcorr(X, label = T)

Insights?

Clustering

Load important clustering libraries

library(factoextra)
library(cluster)
library(mclust)
# SELECT HERE YOUR INITIAL GUESS FOR THE NUMBER OF CLUSTERS AND RUN KMEANS
k = 5 # (as the number of continents) 
fit = kmeans(scale(X), centers=k, nstart=1000)
groups = fit$cluster
barplot(table(groups), col="blue")

Somehow an unbalanced classification

Interpretation of centers:

centers=fit$centers

barplot(centers[1,], las=2, col="darkblue")

barplot(centers[2,], las=2, col="darkblue")

barplot(centers[3,], las=2, col="darkblue")

barplot(centers[4,], las=2, col="darkblue")

barplot(centers[5,], las=2, col="darkblue")

One small group contains most dangerous countries

One big group seems to contain healthy countries

More insights?

Clusplot

Plot the countries in a 2D PCA graph, adding colors for the groups

# clusplot
fviz_cluster(fit, data = X, geom = c("point"),ellipse.type = 'norm', pointsize=1)+
  theme_minimal()+geom_text(label=names,hjust=0, vjust=0,size=2,check_overlap = F)+scale_fill_brewer(palette="Paired")

But what is the meaning of the first PCA?

pca = prcomp(X, scale=T)
barplot(pca$rotation[,1], las=2, col="darkblue")

It is a measure of global development (positive loads on good indicators and negative ones on the bad variables)

We can also use the kmeans of factoextra through eclust which provides different clustering tools (kmeans, pam, hclust, etc.) and different distances and linkages

fit.kmeans <- eclust(X, "kmeans", stand=TRUE, k=5)

Silhouette plot

d <- dist(scale(X), method="euclidean")  
sil = silhouette(groups, d)
plot(sil, col=1:5, main="", border=NA)

summary(sil)
## Silhouette of 165 units in 5 clusters from silhouette.default(x = groups, dist = d) :
##  Cluster sizes and average silhouette widths:
##        10        46        46        35        28 
## 0.3717895 0.1989841 0.3352958 0.1998968 0.2675338 
## Individual silhouette widths:
##     Min.  1st Qu.   Median     Mean  3rd Qu.     Max. 
## -0.00409  0.15599  0.26807  0.25929  0.35870  0.54160
# the same with factoextra
fviz_silhouette(fit.kmeans)
##   cluster size ave.sil.width
## 1       1   55          0.08
## 2       2   16          0.20
## 3       3   22          0.17
## 4       4   26          0.26
## 5       5   46          0.38

Number of groups?

We can get some hints from silhouette, wss, and GAP.

fviz_nbclust(scale(X), kmeans, method = 'silhouette', k.max = 20, nstart = 1000)

fviz_nbclust(scale(X), kmeans, method = 'wss', k.max = 20, nstart = 1000)

fviz_nbclust(scale(X), kmeans, method = 'gap_stat', k.max = 10, nstart = 100, nboot = 500)

Any insight about the number of groups?

PAM

Partitioning (clustering) of the data into k clusters around medoids

More robust version than k-means, the centers are now countries

fit.pam <- eclust(X, "pam", stand=TRUE, k=5, graph=F)

fviz_cluster(fit.pam, data = X, geom = c("point"), pointsize=1)+
  theme_minimal()+geom_text(label=names,hjust=0, vjust=0,size=2,check_overlap = F)+scale_fill_brewer(palette="Paired")

Number of groups by pam:

fviz_nbclust(scale(X), pam, method = 'silhouette', k.max = 10)

fviz_nbclust(scale(X), pam, method = 'gap_stat', k.max = 10, nboot = 500)

How similar are the clusters?

# Computes the adjusted Rand index comparing two classifications.
# The closer to 1 the more agreement
adjustedRandIndex(fit.kmeans$cluster, fit.pam$clustering) 
## [1] 0.3875913

Somehow similar

Map the clustering in a map:

# Select here your favorite clustering tool
map = data.frame(country=names, value=fit.pam$clustering)
#map = data.frame(country=names, value=fit.kmeans$cluster)

#Convert the country code into iso3c using the function countrycode()
map$country = countrycode(map$country, 'country.name', 'iso3c')
#Create data object supporting the map
matched <- joinCountryData2Map(map, joinCode = "ISO3",
                               nameJoinColumn = "country")
## 165 codes from your data successfully matched countries in the map
## 0 codes from your data failed to match with a country code in the map
## 78 codes from the map weren't represented in your data
#Draw the map
mapCountryData(matched,nameColumnToPlot="value",missingCountryCol = "white",
               borderCol = "#C7D9FF",
               catMethod = "pretty", colourPalette = "topo",
               mapTitle = c("Clusters"), lwd=1)

Let’s see the distribution of the clusters considering the Human Development Index:

as.data.frame(X) %>% mutate(cluster=factor(fit.pam$clustering), names=names, hdi=HDI) %>%
  ggplot(aes(x = cluster, y = hdi)) + 
  geom_boxplot(fill="lightblue") +
  labs(title = "HDI by cluster", x = "", y = "", col = "") 

Kernel k-means

Try to capture clusters that are not linearly separable in input space

library(kernlab)

fit.ker <- kkmeans(as.matrix(X), centers=5, kernel="rbfdot") # Radial Basis kernel (Gaussian)
## Using automatic sigma estimation (sigest) for RBF or laplace kernel
# By default, Gaussian kernel is used
# By default, sigma parameter is estimated

centers(fit.ker)
##          [,1]      [,2]      [,3]     [,4]      [,5]     [,6]     [,7]
## [1,] 69.33320  7.520008 26.156312 27.17640  7.247514 14.81432 21.71877
## [2,] 76.26535  8.941569  7.298259 20.76158  3.773833 46.69458 53.93022
## [3,] 80.43877 10.327634  3.771431 17.40417  2.659286 43.07327 78.69664
## [4,] 64.89866  6.819616 47.042244 57.15433 10.985849  7.96500 11.34092
## [5,] 74.52163  8.451433 15.123712 19.65818 43.583042 23.65392 34.44258
##           [,8]     [,9]     [,10]
## [1,]  7.452440 12.33568  7.381847
## [2,] 31.605184 14.98963 11.175819
## [3,] 39.050028 16.77135 12.196735
## [4,]  3.855633 10.46883  5.136607
## [5,] 16.816727 13.38506  9.001980
size(fit.ker)
## [1] 50 38 36 30 11
withinss(fit.ker)
## [1] 347757.84 435256.44 624091.93 335005.23  83404.31
object.ker = list(data = X, cluster = fit.ker@.Data)
fviz_cluster(object.ker, geom = c("point"), ellipse=F,pointsize=1)+
  theme_minimal()+geom_text(label=names,hjust=0, vjust=0,size=2,check_overlap = T)+scale_fill_brewer(palette="Paired")

Hierarchical clustering

Important to decide distance between observations and linkage to join groups

We need to decide first the distance and linkage

# ADD HERE YOUR CHOICE
d = dist(scale(X), method = "euclidean")
hc <- hclust(d, method = "ward.D2") 

Visualization

Classical dendrogram:

hc$labels <- names

fviz_dend(x = hc, 
          k=5,
          palette = "jco", 
          rect = TRUE, rect_fill = TRUE, 
          rect_border = "jco"          
)

Difficult to visualize the countries

Let’s use a phylogenic tree:

fviz_dend(x = hc,
          k = 5,
          color_labels_by_k = TRUE,
          cex = 0.8,
          type = "phylogenic",
          repel = TRUE)+  labs(title="Socio-economic-health tree clustering of the world") + theme(axis.text.x=element_blank(),axis.text.y=element_blank())

Now in a geographical map

groups.hc = cutree(hc, k = 5)

# Map our PCA index in a map:
map = data.frame(country=names, value=groups.hc)
#Convert the country code into iso3c using the function countrycode()
map$country = countrycode(map$country, 'country.name', 'iso3c')
#Create data object supporting the map
matched <- joinCountryData2Map(map, joinCode = "ISO3",
                               nameJoinColumn = "country")
## 165 codes from your data successfully matched countries in the map
## 0 codes from your data failed to match with a country code in the map
## 78 codes from the map weren't represented in your data
#Draw the map
mapCountryData(matched,nameColumnToPlot="value",missingCountryCol = "white",
               borderCol = "#C7D9FF",
               catMethod = "pretty", colourPalette = "topo",
               mapTitle = c("Clusters"), lwd=1)

EM clustering

Expectation-Maximization clustering is like k-means but computes probabilities of cluster memberships based on probability distributions

Hence, the goal of the EM clustering then is to maximize the overall likelihood of the data, given the (final) clusters

res.Mclust <- Mclust(scale(X))
summary(res.Mclust)
## ---------------------------------------------------- 
## Gaussian finite mixture model fitted by EM algorithm 
## ---------------------------------------------------- 
## 
## Mclust VVE (ellipsoidal, equal orientation) model with 5 components: 
## 
##  log-likelihood   n  df       BIC       ICL
##       -994.4194 165 149 -2749.625 -2755.747
## 
## Clustering table:
##  1  2  3  4  5 
## 47 37 30 25 26
# The clustering is probabilistic: for each country we don't have a unique group but the probabilities the country belongs to each of the groups
head(res.Mclust$z)
##                              [,1]         [,2]         [,3]         [,4]
## Afghanistan          1.000000e+00 1.305360e-09 3.001811e-10 5.616534e-30
## Angola               1.000000e+00 2.129092e-08 1.797981e-13 1.423275e-35
## Albania              3.021922e-13 1.446955e-01 2.047918e-03 8.532566e-01
## United Arab Emirates 7.074887e-24 9.999985e-01 4.134297e-11 1.491488e-06
## Argentina            1.390366e-63 4.327390e-07 9.999995e-01 2.394496e-08
## Armenia              2.531043e-84 2.904024e-07 3.866272e-03 9.961334e-01
##                               [,5]
## Afghanistan           0.000000e+00
## Angola                3.757021e-50
## Albania               1.916734e-11
## United Arab Emirates  3.973305e-36
## Argentina            2.371902e-211
## Armenia               7.493723e-11
# Of course the tool assign the group with highest probability  
head(res.Mclust$classification)
##          Afghanistan               Angola              Albania 
##                    1                    1                    4 
## United Arab Emirates            Argentina              Armenia 
##                    2                    3                    4
fviz_mclust(object = res.Mclust, what = "BIC", pallete = "jco") +
  scale_x_discrete(limits = c(1:10))

5 groups is ok

Clusplot

fviz_mclust(object = res.Mclust, what = "classification", geom = "point",
            pallete = "jco")

How similar are the clusters?

# Computes the adjusted Rand index comparing two classifications.
# The closer to 1 the more agreement
adjustedRandIndex(res.Mclust$classification, fit.pam$clustering) 
## [1] 0.5250883
adjustedRandIndex(res.Mclust$classification, groups.hc) 
## [1] 0.4936775

Insights?

Visualization in a map

groups.mclust = res.Mclust$classification

# Map our PCA index in a map:
map = data.frame(country=names, value=groups.mclust)
#Convert the country code into iso3c using the function countrycode()
map$country = countrycode(map$country, 'country.name', 'iso3c')
#Create data object supporting the map
matched <- joinCountryData2Map(map, joinCode = "ISO3",
                               nameJoinColumn = "country")
## 165 codes from your data successfully matched countries in the map
## 0 codes from your data failed to match with a country code in the map
## 78 codes from the map weren't represented in your data
#Draw the map
mapCountryData(matched,nameColumnToPlot="value",missingCountryCol = "white",
               borderCol = "#C7D9FF",
               catMethod = "pretty", colourPalette = "topo",
               mapTitle = c("Clusters"), lwd=1)

Heatmaps

A heat map is a false color image (based on data frame X) with a dendrogram added to the left side and to the top

heatmap(scale(X), scale = "none",
        distfun = function(x){dist(x, method = "euclidean")},
        hclustfun = function(x){hclust(x, method = "ward.D2")},
        cexRow = 0.7)